# Checks Amsterdam data from Stoiber, Thurner, & Pappi
# against EP IGC data from Koenig.
# Lists discrepancies to investigate further
# Creates a data file which fills in EP missing values with STP
# if EP missing and STP not missing
# Jonathan Slapin
# February 13, 2005
# Rev. 4/18/2006

# Start from an empty workspace so objects left over from an earlier
# session cannot leak into this run
rm(list = ls(all.names = TRUE))

library(foreign)  # supplies read.dta() for reading Stata files
library(MASS)


# Working directory: every relative path below is resolved against it

setwd("/Users/jslapin/Documents/papers and diss/IO Replication Materials")

# Stoiber, Thurner & Pappi data
stoiber <- read.dta(file = "stoiber_thurner_pappi.dta")

# EP taskforce IGC data (loaded only; nothing is written back here)
ep <- read.dta(file = "igc15-02_missing.dta")

# Compare the two data sets against each other.
# Create a new matrix, "check", in which each element corresponds to the
# element at the same position in the ep and stoiber data.
# If stoiber and ep agree,            check == 1
# If stoiber and ep disagree,         check == 0
# NA means one or both are missing

# Build the matrix recording how well the two data sets correspond.
#
# For every compared cell the result is
#   1  if the Stoiber et al. value is within 0.5 of the EP value,
#   0  if the two values differ by more than 0.5,
#   NA if either value is missing.
# Rows and columns are matched purely by position, so both data sets are
# assumed to share the same row order and column layout.
# The result is written to "temp/checkdata.csv".

n_rows <- nrow(ep)
n_cols <- ncol(ep)

check1 <- matrix(nrow = n_rows, ncol = n_cols)

# NOTE(review): check2 and check3 are never filled or read in the visible
# script; they are kept only in case code outside this view relies on them.
check2 <- matrix(nrow = n_rows, ncol = n_cols)
check3 <- matrix(nrow = n_rows, ncol = n_cols)

# Only columns 4 onward are compared; columns 1-3 stay NA in the result.
# Dropping the first three indices from seq_len() yields an empty set when
# there are fewer than four columns, whereas 4:ncol(ep) would count
# backwards.
ep_rows <- seq_len(n_rows)
compare_cols <- seq_len(n_cols)[-seq_len(3)]

# One vectorised comparison per column instead of one per cell
for (j in compare_cols) {
  a <- ep[, j]
  b <- stoiber[ep_rows, j]
  check1[, j] <- ifelse(abs(a - b) <= .5, 1, 0)
}

check <- data.frame(check1)
write.csv(check, file = "temp/checkdata.csv")

# Print Number Matching Between Datasets

# Cells of "check" that are NA: positions missing in at least one of the
# two data sets, plus the columns that were never compared (1-3).
numbothmissing <- sum(is.na(check))

# Positions present in both data sets. The total number of cells is taken
# from "check" itself rather than hard-coded, so the count stays correct
# if the data dimensions change.
bothpos <- nrow(check) * ncol(check) - numbothmissing

# Every matching cell holds a 1, so the sum of non-missing cells is the
# number of matches.
same <- sum(as.matrix(check), na.rm = TRUE)

# Share of common positions that agree (a proportion between 0 and 1)
percent <- same / bothpos

# Terminate each line explicitly; cat() does not append a newline itself
cat("Number of Positions in Common:", bothpos, "\n")
cat("Number of Positions the Same:", same, "\n")
cat("Percent Same:", percent, "\n")


# Here I create a new data set, "new", in which values missing from the
# EP data are filled in with the value at the same position in STP.
# If both are missing the cell stays NA.
#
# Columns 1-3 are not copied and stay NA in "new", and the resulting data
# frame gets default column names, because it is built from a bare matrix.

new <- matrix(nrow = nrow(ep), ncol = ncol(ep))

# Dropping the first three indices from seq_len() yields an empty set when
# there are fewer than four columns, whereas 4:ncol(ep) would count
# backwards.
ep_rows <- seq_len(nrow(ep))
fill_cols <- seq_len(ncol(ep))[-seq_len(3)]

for (j in fill_cols) {
  a <- ep[, j]
  b <- stoiber[ep_rows, j]
  # Missingness must be tested with is.na(): comparing against the string
  # "NA" yields NA for a missing value, so nothing would ever be filled in.
  new[, j] <- ifelse(is.na(a), b, a)
}

# Save the new data
ams_ep_stoiber <- data.frame(new)
write.csv(ams_ep_stoiber, file = "temp/ams_ep_stoiber.csv")

